rm(list = ls())
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.6.8 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
## The following object is masked from 'package:purrr':
## 
##     lift
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
## 
##     complete
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(ggcorrplot)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
## 
##     coords
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded glmnet 2.0-16
## 
## Attaching package: 'glmnet'
## The following object is masked from 'package:pROC':
## 
##     auc
library(plotly)
set.seed(1)

Motivation

The goal for our final project was to build a prediction model using the LendingClub loans data set. We wanted to build a model that could predict the a dichotomized outcome of loan status (which will be explained in more detail below) using the variables given in the data set. In this web page, we will walk you through our process for building our final prediction model.

Data Exploration

Lara, can you explain where and how you got the data set?

setwd('/Users/shuheimiyasaka/Google Drive/Harvard/Courses/BST 260/BST-260-Final-Project')
load('loan.Rdata')
#loan.dat <- read.csv('loan.csv', header = TRUE)
#save(loan.dat, file = "./loan.RData")

#extract.3000 <- sample(1:dim(loan.dat)[1], 3000, replace = FALSE)
#write.csv(loan.dat[extract.3000, ], './loan_subset3000.csv')

The data set has 887,379 records with 74 variables.

dim(loan.dat)
## [1] 887379     74
names(loan.dat)
##  [1] "id"                          "member_id"                  
##  [3] "loan_amnt"                   "funded_amnt"                
##  [5] "funded_amnt_inv"             "term"                       
##  [7] "int_rate"                    "installment"                
##  [9] "grade"                       "sub_grade"                  
## [11] "emp_title"                   "emp_length"                 
## [13] "home_ownership"              "annual_inc"                 
## [15] "verification_status"         "issue_d"                    
## [17] "loan_status"                 "pymnt_plan"                 
## [19] "url"                         "desc"                       
## [21] "purpose"                     "title"                      
## [23] "zip_code"                    "addr_state"                 
## [25] "dti"                         "delinq_2yrs"                
## [27] "earliest_cr_line"            "inq_last_6mths"             
## [29] "mths_since_last_delinq"      "mths_since_last_record"     
## [31] "open_acc"                    "pub_rec"                    
## [33] "revol_bal"                   "revol_util"                 
## [35] "total_acc"                   "initial_list_status"        
## [37] "out_prncp"                   "out_prncp_inv"              
## [39] "total_pymnt"                 "total_pymnt_inv"            
## [41] "total_rec_prncp"             "total_rec_int"              
## [43] "total_rec_late_fee"          "recoveries"                 
## [45] "collection_recovery_fee"     "last_pymnt_d"               
## [47] "last_pymnt_amnt"             "next_pymnt_d"               
## [49] "last_credit_pull_d"          "collections_12_mths_ex_med" 
## [51] "mths_since_last_major_derog" "policy_code"                
## [53] "application_type"            "annual_inc_joint"           
## [55] "dti_joint"                   "verification_status_joint"  
## [57] "acc_now_delinq"              "tot_coll_amt"               
## [59] "tot_cur_bal"                 "open_acc_6m"                
## [61] "open_il_6m"                  "open_il_12m"                
## [63] "open_il_24m"                 "mths_since_rcnt_il"         
## [65] "total_bal_il"                "il_util"                    
## [67] "open_rv_12m"                 "open_rv_24m"                
## [69] "max_bal_bc"                  "all_util"                   
## [71] "total_rev_hi_lim"            "inq_fi"                     
## [73] "total_cu_tl"                 "inq_last_12m"
meta_loans <- funModeling::df_status(loan.dat, print_results = FALSE)
meta_loans[order(-meta_loans$p_na),]

As part of data exploration, we examined with percentage of “zeros”, missing records, and unique values in our data set per variable as shown above. From the table above, we notice quite a variables with a lot of missing data.

Based on examining the data set and reading the data dictionary, we decided to immediately rule out the following variables from our model: id, member_id, url, and desc.

cols.2.remove <- c('id', 'member_id', 'url', 'desc')

We decided to exclude variables with more than 10% missing data (19 variables).

missing.data.col <- meta_loans$variable[meta_loans$p_na > 10.]
missing.data.col
##  [1] "mths_since_last_delinq"      "mths_since_last_record"     
##  [3] "mths_since_last_major_derog" "annual_inc_joint"           
##  [5] "dti_joint"                   "open_acc_6m"                
##  [7] "open_il_6m"                  "open_il_12m"                
##  [9] "open_il_24m"                 "mths_since_rcnt_il"         
## [11] "total_bal_il"                "il_util"                    
## [13] "open_rv_12m"                 "open_rv_24m"                
## [15] "max_bal_bc"                  "all_util"                   
## [17] "inq_fi"                      "total_cu_tl"                
## [19] "inq_last_12m"
length(missing.data.col)
## [1] 19
cols.2.remove <- c(cols.2.remove, missing.data.col)
meta_loans[order(meta_loans$unique),]
cols.2.remove <- c(cols.2.remove, 'policy_code')

We also decided to remove policy_code since it only has one unique value.

At this point, we had 50 potential covariates:

cols.2.keep <- !(colnames(loan.dat) %in% cols.2.remove)
colnames(loan.dat)[cols.2.keep]
##  [1] "loan_amnt"                  "funded_amnt"               
##  [3] "funded_amnt_inv"            "term"                      
##  [5] "int_rate"                   "installment"               
##  [7] "grade"                      "sub_grade"                 
##  [9] "emp_title"                  "emp_length"                
## [11] "home_ownership"             "annual_inc"                
## [13] "verification_status"        "issue_d"                   
## [15] "loan_status"                "pymnt_plan"                
## [17] "purpose"                    "title"                     
## [19] "zip_code"                   "addr_state"                
## [21] "dti"                        "delinq_2yrs"               
## [23] "earliest_cr_line"           "inq_last_6mths"            
## [25] "open_acc"                   "pub_rec"                   
## [27] "revol_bal"                  "revol_util"                
## [29] "total_acc"                  "initial_list_status"       
## [31] "out_prncp"                  "out_prncp_inv"             
## [33] "total_pymnt"                "total_pymnt_inv"           
## [35] "total_rec_prncp"            "total_rec_int"             
## [37] "total_rec_late_fee"         "recoveries"                
## [39] "collection_recovery_fee"    "last_pymnt_d"              
## [41] "last_pymnt_amnt"            "next_pymnt_d"              
## [43] "last_credit_pull_d"         "collections_12_mths_ex_med"
## [45] "application_type"           "verification_status_joint" 
## [47] "acc_now_delinq"             "tot_coll_amt"              
## [49] "tot_cur_bal"                "total_rev_hi_lim"
length(colnames(loan.dat)[cols.2.keep])
## [1] 50
loan.dat <- loan.dat[, cols.2.keep]

We also decided to remove 6 records with missing or zero annual income since this is a covariate we definitely wanted to include in our model (and didn’t feel we could impute these values properly)!

query = loan.dat$annual_inc == 0.
query.na = is.na(query)
if (sum(query.na) > 0){
  query[query.na] = TRUE
}
if (sum(query) > 0){
  loan.dat = loan.dat[!query,]
} else stop('unexpected case')

We the remaining set of records and covariates, we decided to examine the pairwise correlation of covariates:

meta_loans <- funModeling::df_status(loan.dat, print_results = FALSE)
numeric_cols <- meta_loans$variable[meta_loans$type == 'numeric']

cor.dat <- cor(loan.dat[,numeric_cols], loan.dat[,numeric_cols])
plot_ly(x=colnames(cor.dat), 
        y=rownames(cor.dat), 
        z = cor.dat, type = "heatmap", colorscale="Greys")
#ggcorrplot(cor(loan.dat[,numeric_cols]))
#aggr(loan.dat, combined=T, cex.axis=0.6)

We also calculated basic summary statistics of our covariates to help us better understand the data:

summary(loan.dat)
##    loan_amnt      funded_amnt    funded_amnt_inv         term       
##  Min.   :  500   Min.   :  500   Min.   :    0    36 months:621119  
##  1st Qu.: 8000   1st Qu.: 8000   1st Qu.: 8000    60 months:266254  
##  Median :13000   Median :13000   Median :13000                      
##  Mean   :14755   Mean   :14742   Mean   :14703                      
##  3rd Qu.:20000   3rd Qu.:20000   3rd Qu.:20000                      
##  Max.   :35000   Max.   :35000   Max.   :35000                      
##                                                                     
##     int_rate      installment      grade        sub_grade     
##  Min.   : 5.32   Min.   :  15.67   A:148198   B3     : 56323  
##  1st Qu.: 9.99   1st Qu.: 260.71   B:254535   B4     : 55626  
##  Median :12.99   Median : 382.55   C:245859   C1     : 53387  
##  Mean   :13.25   Mean   : 436.72   D:139541   C2     : 52235  
##  3rd Qu.:16.20   3rd Qu.: 572.60   E: 70705   C3     : 50161  
##  Max.   :28.99   Max.   :1445.46   F: 23046   C4     : 48857  
##                                    G:  5489   (Other):570784  
##             emp_title          emp_length      home_ownership  
##                  : 51451   10+ years:291569   ANY     :     3  
##  Teacher         : 13469   2 years  : 78870   MORTGAGE:443555  
##  Manager         : 11240   < 1 year : 70601   NONE    :    46  
##  Registered Nurse:  5525   3 years  : 70026   OTHER   :   182  
##  Owner           :  5376   1 year   : 57095   OWN     : 87470  
##  RN              :  5355   5 years  : 55704   RENT    :356117  
##  (Other)         :794957   (Other)  :263508                    
##    annual_inc           verification_status     issue_d      
##  Min.   :   1200   Not Verified   :266744   Oct-2015: 48631  
##  1st Qu.:  45000   Source Verified:329558   Jul-2015: 45962  
##  Median :  65000   Verified       :291071   Dec-2015: 44341  
##  Mean   :  75028                            Oct-2014: 38782  
##  3rd Qu.:  90000                            Nov-2015: 37529  
##  Max.   :9500000                            Aug-2015: 35886  
##                                             (Other) :636242  
##              loan_status     pymnt_plan               purpose      
##  Current           :601777   n:887363   debt_consolidation:524214  
##  Fully Paid        :207723   y:    10   credit_card       :206181  
##  Charged Off       : 45248              home_improvement  : 51829  
##  Late (31-120 days): 11591              other             : 42890  
##  Issued            :  8460              major_purchase    : 17277  
##  In Grace Period   :  6253              small_business    : 10377  
##  (Other)           :  6321              (Other)           : 34605  
##                      title           zip_code        addr_state    
##  Debt consolidation     :414000   945xx  :  9770   CA     :129517  
##  Credit card refinancing:164330   750xx  :  9417   NY     : 74082  
##  Home improvement       : 40112   112xx  :  9272   TX     : 71136  
##  Other                  : 31892   606xx  :  8641   FL     : 60935  
##  Debt Consolidation     : 15760   300xx  :  8126   IL     : 35476  
##  Major purchase         : 12051   100xx  :  7605   NJ     : 33256  
##  (Other)                :209228   (Other):834542   (Other):482971  
##       dti           delinq_2yrs      earliest_cr_line  inq_last_6mths   
##  Min.   :   0.00   Min.   : 0.0000   Aug-2001:  6659   Min.   : 0.0000  
##  1st Qu.:  11.91   1st Qu.: 0.0000   Aug-2000:  6529   1st Qu.: 0.0000  
##  Median :  17.65   Median : 0.0000   Oct-2000:  6322   Median : 0.0000  
##  Mean   :  18.13   Mean   : 0.3144   Oct-2001:  6154   Mean   : 0.6946  
##  3rd Qu.:  23.95   3rd Qu.: 0.0000   Aug-2002:  6086   3rd Qu.: 1.0000  
##  Max.   :1092.52   Max.   :39.0000   Sep-2000:  5918   Max.   :33.0000  
##                    NA's   :25        (Other) :849705   NA's   :25       
##     open_acc        pub_rec          revol_bal         revol_util    
##  Min.   : 0.00   Min.   : 0.0000   Min.   :      0   Min.   :  0.00  
##  1st Qu.: 8.00   1st Qu.: 0.0000   1st Qu.:   6443   1st Qu.: 37.70  
##  Median :11.00   Median : 0.0000   Median :  11875   Median : 56.00  
##  Mean   :11.55   Mean   : 0.1953   Mean   :  16921   Mean   : 55.07  
##  3rd Qu.:14.00   3rd Qu.: 0.0000   3rd Qu.:  20829   3rd Qu.: 73.60  
##  Max.   :90.00   Max.   :86.0000   Max.   :2904836   Max.   :892.30  
##  NA's   :25      NA's   :25                          NA's   :498     
##    total_acc      initial_list_status   out_prncp     out_prncp_inv  
##  Min.   :  1.00   f:456843            Min.   :    0   Min.   :    0  
##  1st Qu.: 17.00   w:430530            1st Qu.:    0   1st Qu.:    0  
##  Median : 24.00                       Median : 6459   Median : 6456  
##  Mean   : 25.27                       Mean   : 8403   Mean   : 8400  
##  3rd Qu.: 32.00                       3rd Qu.:13659   3rd Qu.:13654  
##  Max.   :169.00                       Max.   :49373   Max.   :49373  
##  NA's   :25                                                          
##   total_pymnt    total_pymnt_inv total_rec_prncp total_rec_int    
##  Min.   :    0   Min.   :    0   Min.   :    0   Min.   :    0.0  
##  1st Qu.: 1915   1st Qu.: 1900   1st Qu.: 1201   1st Qu.:  441.5  
##  Median : 4895   Median : 4862   Median : 3215   Median : 1073.3  
##  Mean   : 7559   Mean   : 7521   Mean   : 5758   Mean   : 1754.8  
##  3rd Qu.:10617   3rd Qu.:10566   3rd Qu.: 8000   3rd Qu.: 2238.3  
##  Max.   :57778   Max.   :57778   Max.   :35000   Max.   :24205.6  
##                                                                   
##  total_rec_late_fee   recoveries       collection_recovery_fee
##  Min.   :  0.0000   Min.   :    0.00   Min.   :   0.000       
##  1st Qu.:  0.0000   1st Qu.:    0.00   1st Qu.:   0.000       
##  Median :  0.0000   Median :    0.00   Median :   0.000       
##  Mean   :  0.3967   Mean   :   45.92   Mean   :   4.881       
##  3rd Qu.:  0.0000   3rd Qu.:    0.00   3rd Qu.:   0.000       
##  Max.   :358.6800   Max.   :33520.27   Max.   :7002.190       
##                                                               
##    last_pymnt_d    last_pymnt_amnt     next_pymnt_d    last_credit_pull_d
##  Jan-2016:470148   Min.   :    0.0   Feb-2016:553404   Jan-2016:730572   
##  Dec-2015:150861   1st Qu.:  280.2           :252971   Dec-2015: 19308   
##          : 17659   Median :  462.8   Jan-2016: 78195   Nov-2015: 11490   
##  Oct-2015: 16000   Mean   : 2164.2   Mar-2011:   107   Oct-2015: 10419   
##  Jul-2015: 14483   3rd Qu.:  831.2   Apr-2011:   101   Sep-2015: 10087   
##  Nov-2015: 13981   Max.   :36475.6   Feb-2011:    91   Jul-2015:  8642   
##  (Other) :204241                     (Other) :  2504   (Other) : 96855   
##  collections_12_mths_ex_med   application_type 
##  Min.   : 0.00000           INDIVIDUAL:886864  
##  1st Qu.: 0.00000           JOINT     :   509  
##  Median : 0.00000                              
##  Mean   : 0.01438                              
##  3rd Qu.: 0.00000                              
##  Max.   :20.00000                              
##  NA's   :141                                   
##    verification_status_joint acc_now_delinq       tot_coll_amt    
##                 :886864      Min.   : 0.000000   Min.   :      0  
##  Not Verified   :   281      1st Qu.: 0.000000   1st Qu.:      0  
##  Source Verified:    61      Median : 0.000000   Median :      0  
##  Verified       :   167      Mean   : 0.004991   Mean   :    226  
##                              3rd Qu.: 0.000000   3rd Qu.:      0  
##                              Max.   :14.000000   Max.   :9152545  
##                              NA's   :25          NA's   :70272    
##   tot_cur_bal      total_rev_hi_lim 
##  Min.   :      0   Min.   :      0  
##  1st Qu.:  29853   1st Qu.:  13900  
##  Median :  80559   Median :  23700  
##  Mean   : 139458   Mean   :  32069  
##  3rd Qu.: 208205   3rd Qu.:  39800  
##  Max.   :8000078   Max.   :9999999  
##  NA's   :70272     NA's   :70272

Feature Engineering

In our prediction model, we decided to predict loan status. We dichotomized loan status into “Good” and “Bad” based on the following criteria: Good 1. Fully Paid 2. Current 3. Does not meet the credit policy. Status:Fully Paid Bad 1. Default 2. Charged Off
3. Late (16-30 days)
4. Late (31-120 days)
5. In Grace Period 6. Does not meet the credit policy. Status:Charged Off

Since we are predicting loan status using our model, we decided to drop records with loan_status with values Issued:

query <- loan.dat$loan_status != 'Issued'
loan.dat <- loan.dat[query, ]

loan.dat$loan_status_bin <- "Bad"
query = loan.dat$loan_status == 'Fully Paid' | loan.dat$loan_status == 'Current' |
  loan.dat$loan_status == 'Does not meet the credit policy. Status:Fully Paid'
loan.dat$loan_status_bin[query] = 'Good'

loan.dat$loan_status_bin = as.factor(loan.dat$loan_status_bin)

We also converted funded_amnt_inv to be percent funded amount by investors perc_funded_amnt_inv and only use the year from issue date (issue_d):

loan.dat <- loan.dat %>%
  mutate(perc_funded_amnt_inv = funded_amnt_inv/funded_amnt,
         issue_d = as.character(issue_d),
         term = as.character(term)) %>%
  mutate(year = as.numeric(str_sub(issue_d, start = -4)))

We reclassified a 36 month loan to “Short” and and 60 month loan to “Long”.

query <- loan.dat$term == ' 36 months'
loan.dat$term[query] = 'Short'
loan.dat$term[!query] = 'Long'
loan.dat$term = as.factor(loan.dat$term)

We also dichotomized tot_coll_amt to 0 and greater than 0 (and replaced the missing values to zero):

query.na <- is.na(loan.dat$tot_coll_amt)
if (sum(query.na) >0){
  loan.dat$tot_coll_amt[query.na] = 0
}
loan.dat <- loan.dat %>%
  mutate(tot_coll_amt_gt0 = as.factor(tot_coll_amt > 0.))

Based on examining the uni-variate plots and the correlation plot, we decided to keep the following 27 covariates: ### we can probably add a more detailed reason for dropping certain columns as outline in google docs

predictors <- c('loan_amnt', 'funded_amnt',
                'int_rate', 'grade',
                'emp_length', 'home_ownership',
                'annual_inc', 'verification_status',
                'purpose',
                'addr_state', 'dti',
                'delinq_2yrs', 'inq_last_6mths',
                'open_acc', 'pub_rec', 
                'revol_bal', 'revol_util',
                'total_acc', 'initial_list_status',
                'application_type', 'acc_now_delinq',
                'tot_coll_amt_gt0', 'tot_cur_bal',
                'total_rev_hi_lim', 'perc_funded_amnt_inv',
                'term', 'year')
predictors
##  [1] "loan_amnt"            "funded_amnt"          "int_rate"            
##  [4] "grade"                "emp_length"           "home_ownership"      
##  [7] "annual_inc"           "verification_status"  "purpose"             
## [10] "addr_state"           "dti"                  "delinq_2yrs"         
## [13] "inq_last_6mths"       "open_acc"             "pub_rec"             
## [16] "revol_bal"            "revol_util"           "total_acc"           
## [19] "initial_list_status"  "application_type"     "acc_now_delinq"      
## [22] "tot_coll_amt_gt0"     "tot_cur_bal"          "total_rev_hi_lim"    
## [25] "perc_funded_amnt_inv" "term"                 "year"
outcome <- c('loan_status_bin')

loan.dat <- loan.dat[, c(outcome, predictors)]

We also did a simple uni-variate analysis of the covariates using histograms and boxplots:

meta_loans <- funModeling::df_status(loan.dat, print_results = FALSE)
numeric_cols <- meta_loans$variable[meta_loans$type == 'numeric']

for (col_name in numeric_cols){
  plt <- loan.dat %>% ggplot(aes(loan.dat[, col_name])) +
    geom_histogram(color = "black") + 
    ggtitle(col_name) + labs(x=col_name) 
  print(plt)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 495 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 25 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 70272 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 70272 rows containing non-finite values (stat_bin).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

for (col_name in numeric_cols){
  plt <- loan.dat %>% ggplot(aes(x=loan_status_bin, loan.dat[, col_name])) +
    geom_boxplot() + 
    ggtitle(col_name) + labs(x='Loan Status', y=col_name)
  print(plt)
}

## Warning: Removed 25 rows containing non-finite values (stat_boxplot).

## Warning: Removed 25 rows containing non-finite values (stat_boxplot).

## Warning: Removed 25 rows containing non-finite values (stat_boxplot).

## Warning: Removed 25 rows containing non-finite values (stat_boxplot).

## Warning: Removed 495 rows containing non-finite values (stat_boxplot).

## Warning: Removed 25 rows containing non-finite values (stat_boxplot).

## Warning: Removed 25 rows containing non-finite values (stat_boxplot).

## Warning: Removed 70272 rows containing non-finite values (stat_boxplot).

## Warning: Removed 70272 rows containing non-finite values (stat_boxplot).

We still have a few records with missing data. We initially tried imputing these values using the mice package but due to computational reasons, we decided to drop this idea.

meta_loans <- funModeling::df_status(loan.dat, print_results = FALSE)
meta_loans[order(-meta_loans$p_na),]
# loan.dat = mice(loan.dat, m=1)  # let's just impute one dataset
# loan.dat = complete(loan.dat, 1)

Instead, we decided to set the missing values to the mean using the rest of the data:

sum(is.na(loan.dat$loan_status_bin))
## [1] 0
for (j in 1:ncol(loan.dat)){
  miss = is.na(loan.dat[,j])
  if (sum(miss) > 0){
    loan.dat[miss, j] = mean(loan.dat[,j], na.rm=T)
  }
}
sum(is.na(loan.dat))
## [1] 0

Here is a final basic summary statistics of the remaining covariates:

summary(loan.dat)
##  loan_status_bin   loan_amnt      funded_amnt       int_rate    
##  Bad : 67429     Min.   :  500   Min.   :  500   Min.   : 5.32  
##  Good:811484     1st Qu.: 8000   1st Qu.: 8000   1st Qu.: 9.99  
##                  Median :13000   Median :13000   Median :12.99  
##                  Mean   :14750   Mean   :14737   Mean   :13.25  
##                  3rd Qu.:20000   3rd Qu.:20000   3rd Qu.:16.20  
##                  Max.   :35000   Max.   :35000   Max.   :28.99  
##                                                                 
##  grade          emp_length      home_ownership     annual_inc     
##  A:146750   10+ years:288752   ANY     :     3   Min.   :   1200  
##  B:252006   2 years  : 78167   MORTGAGE:439335   1st Qu.:  45000  
##  C:243387   < 1 year : 69828   NONE    :    46   Median :  64800  
##  D:138356   3 years  : 69344   OTHER   :   182   Mean   :  74996  
##  E: 70112   1 year   : 56547   OWN     : 86432   3rd Qu.:  90000  
##  F: 22852   5 years  : 55194   RENT    :352915   Max.   :9500000  
##  G:  5450   (Other)  :261081                                      
##       verification_status               purpose         addr_state    
##  Not Verified   :263965   debt_consolidation:519418   CA     :128370  
##  Source Verified:326722   credit_card       :204110   NY     : 73360  
##  Verified       :288226   home_improvement  : 51336   TX     : 70445  
##                           other             : 42410   FL     : 60328  
##                           major_purchase    : 17093   IL     : 35178  
##                           small_business    : 10265   NJ     : 32960  
##                           (Other)           : 34281   (Other):478272  
##       dti           delinq_2yrs      inq_last_6mths       open_acc    
##  Min.   :   0.00   Min.   : 0.0000   Min.   : 0.0000   Min.   : 0.00  
##  1st Qu.:  11.90   1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.: 8.00  
##  Median :  17.64   Median : 0.0000   Median : 0.0000   Median :11.00  
##  Mean   :  18.12   Mean   : 0.3141   Mean   : 0.6952   Mean   :11.54  
##  3rd Qu.:  23.93   3rd Qu.: 0.0000   3rd Qu.: 1.0000   3rd Qu.:14.00  
##  Max.   :1092.52   Max.   :39.0000   Max.   :33.0000   Max.   :90.00  
##                                                                       
##     pub_rec          revol_bal         revol_util      total_acc     
##  Min.   : 0.0000   Min.   :      0   Min.   :  0.0   Min.   :  1.00  
##  1st Qu.: 0.0000   1st Qu.:   6446   1st Qu.: 37.7   1st Qu.: 17.00  
##  Median : 0.0000   Median :  11875   Median : 56.0   Median : 24.00  
##  Mean   : 0.1948   Mean   :  16912   Mean   : 55.1   Mean   : 25.26  
##  3rd Qu.: 0.0000   3rd Qu.:  20823   3rd Qu.: 73.6   3rd Qu.: 32.00  
##  Max.   :86.0000   Max.   :2904836   Max.   :892.3   Max.   :169.00  
##                                                                      
##  initial_list_status   application_type  acc_now_delinq     
##  f:456023            INDIVIDUAL:878468   Min.   : 0.000000  
##  w:422890            JOINT     :   445   1st Qu.: 0.000000  
##                                          Median : 0.000000  
##                                          Mean   : 0.004994  
##                                          3rd Qu.: 0.000000  
##                                          Max.   :14.000000  
##                                                             
##  tot_coll_amt_gt0  tot_cur_bal      total_rev_hi_lim  perc_funded_amnt_inv
##  FALSE:764171     Min.   :      0   Min.   :      0   Min.   :0.0000      
##  TRUE :114742     1st Qu.:  32250   1st Qu.:  14700   1st Qu.:1.0000      
##                   Median : 100278   Median :  25800   Median :1.0000      
##                   Mean   : 139394   Mean   :  32035   Mean   :0.9964      
##                   3rd Qu.: 195607   3rd Qu.:  37800   3rd Qu.:1.0000      
##                   Max.   :8000078   Max.   :9999999   Max.   :1.0000      
##                                                                           
##     term             year     
##  Long :263776   Min.   :2007  
##  Short:615137   1st Qu.:2013  
##                 Median :2014  
##                 Mean   :2014  
##                 3rd Qu.:2015  
##                 Max.   :2015  
## 
setwd('/Users/shuheimiyasaka/Google Drive/Harvard/Courses/BST 260/BST-260-Final-Project')
save(loan.dat, file = "./loan.dat.RData")

Building the Prediction Model

We decided to evaluate three models: 1. Ridge 2. Lasso 3. Elastic Net

In evaluating our model, we picked our final model based on the best test performance (i.e., largest AUC). We kept about a fifth of a data (200,000 records) for testing and used the rest for training our models. We trained our candidate models using 10-fold cross-validation (on the training set) to obtain the optimal tuning parameters and finally tested their performance on the test data set.

# This chunk takes a very very long time to run!!!
# I ran this overnight on my laptop and 
# saved the results
# That's why eval is set to FALSE

setwd('/Users/shuheimiyasaka/Google Drive/Harvard/Courses/BST 260/BST-260-Final-Project')
load('loan.dat.Rdata')

# keep a fifth of the data set to assess test performance
set.seed(1)
test.indx <- sample(1:dim(loan.dat)[1], 200000, replace = FALSE)
train.indx <- setdiff(1:dim(loan.dat)[1], test.indx)

# this is to debug the code
# small.data.indx <- sample(1:dim(loan.dat)[1], 10000, replace = FALSE)
# loan.dat <- loan.dat[small.data.indx, ]
# test.indx <- sample(1:dim(loan.dat)[1], 500, replace = FALSE)
# train.indx <- setdiff(1:dim(loan.dat)[1], test.indx)

train.control = trainControl(method="repeatedcv", number=10, repeats=1, 
                    classProbs=T, summaryFunction=twoClassSummary)
#models = c("ridge", "lasso", "enet", "adaboost")
models = c("ridge", "lasso", "enet")
n_models = length(models)

# test.indx2 <- sample(1:dim(loan.dat)[1], 10000, replace = FALSE)
# fit <- glm(loan_status_bin ~., family=binomial(link='logit'),
#            data=loan.dat[test.indx2, ])
# summary(fit)

# x = model.matrix(loan_status_bin ~ ., loan.dat)[test.indx2,-1]
# y = loan.dat$loan_status_bin[test.indx2]
# fit.lasso <- cv.glmnet(x=x, y=y, family="binomial", alpha=1)
# fit.lasso
# coef.min = coef(fit.lasso, s = "lambda.min")
# coef.min
# 
# fit.enet <- cv.glmnet(x=x, y=y, family="binomial", alpha=0.5)
# fit.enet
# 
# fit.ridge <- cv.glmnet(x=x, y=y, family="binomial", alpha=0)
# fit.ridge

AUC = rep(0,n_models)
names(AUC) = models
for (m in 1:n_models) {
    
    print_str <- paste("Training model: ",
                       models[m], sep='')
    print(print_str)
    
    # save our results
    if (models[m] == 'ridge'){
      
      fit = train(loan_status_bin ~., 
                  data=loan.dat[train.indx, ], 
                  method="glmnet", metric="ROC", trControl=train.control,
                  tuneGrid=expand.grid(alpha = 0, lambda = .5 ^ (-20:20)))  
      
      fit.ridge <- fit
    } else if (models[m] == 'lasso'){
      
      fit = train(loan_status_bin ~., 
                  data=loan.dat[train.indx, ], 
                  method="glmnet", metric="ROC", trControl=train.control,
                  tuneGrid=expand.grid(alpha = 1, lambda = .5 ^ (-20:20)))      

      fit.lasso <- fit
    } else if (models[m] == 'enet'){
      
      fit = train(loan_status_bin ~., 
                  data=loan.dat[train.indx, ], 
                  method="glmnet", metric="ROC", trControl=train.control,
                  tuneGrid=expand.grid(alpha = seq(.05,.95,.05), lambda = .5 ^ (-20:20)))     

      fit.enet <- fit
    } else if (models[m] == 'adaboost'){

      fit = train(loan_status_bin ~., 
            data=loan.dat[train.indx, ], 
            method=models[m], metric="ROC", trControl=train.control)
  
      fit.adaboost <- fit
      
    } else('Unknown model type!')
    
    probs = predict(fit, loan.dat[test.indx,], type="prob")
    
    R = roc(loan.dat$loan_status_bin[test.indx], probs$Good)
    
    plot.roc(R, add=(m>1), col=m, lwd=2, main="ROC curves")
    
    legend("bottomright", legend=models, col=1:n_models, lwd=2)

    AUC[m] = R$auc
}
AUC

setwd('/Users/shuheimiyasaka/Google Drive/Harvard/Courses/BST 260/BST-260-Final-Project')
save(fit.ridge, fit.lasso, fit.enet, file = "./models.RData")
#save(fit.ridge, fit.lasso, fit.enet, fit.adaboost, file = "./models.RData")

##     ridge     lasso      enet 
## 0.7494317 0.7507364 0.7508311

The AUC values for all three models are similar. However, elastic net model \(\alpha=0.05\) and \(\lambda=0.0001220703\) had the largest AUC. Therefore, for our final prediction model, decided to use the elastic net model with \(\alpha=0.05\) and \(\lambda=0.0001220703\) and retrained the model with all data.

x = model.matrix(loan_status_bin ~ ., loan.dat)[,-1]
y = loan.dat$loan_status_bin

final_mod = glmnet(x=x, y=y, family = 'binomial', alpha = 0.05, 
                   lambda = 0.0001220703)
#setwd('/Users/shuheimiyasaka/Google Drive/Harvard/Courses/BST 260/BST-260-Final-Project')
#save(final_mod, file = "./final_model.RData")

Results

So our final model is an elastic net model (\(\alpha=0.05\) and \(\lambda=0.0001220703\)) with the following regression coefficients:

coef(final_mod)
## 109 x 1 sparse Matrix of class "dgCMatrix"
##                                               s0
## (Intercept)                        -7.755477e+02
## loan_amnt                          -9.019996e-06
## funded_amnt                        -1.372821e-06
## int_rate                           -2.158074e-01
## gradeB                              9.012855e-02
## gradeC                              3.026705e-01
## gradeD                              6.032857e-01
## gradeE                              9.440011e-01
## gradeF                              1.378499e+00
## gradeG                              1.607011e+00
## emp_length1 year                    5.455825e-02
## emp_length10+ years                 1.131587e-01
## emp_length2 years                   7.302165e-02
## emp_length3 years                   5.104954e-02
## emp_length4 years                   7.584263e-02
## emp_length5 years                   4.969832e-02
## emp_length6 years                  -1.961420e-02
## emp_length7 years                   1.356539e-02
## emp_length8 years                   4.307889e-02
## emp_length9 years                   1.167799e-02
## emp_lengthn/a                      -1.202705e-01
## home_ownershipMORTGAGE              7.508743e-02
## home_ownershipNONE                  1.218026e-01
## home_ownershipOTHER                 2.321932e-01
## home_ownershipOWN                   4.033432e-02
## home_ownershipRENT                 -4.767680e-02
## annual_inc                          2.529573e-06
## verification_statusSource Verified -8.357074e-02
## verification_statusVerified        -3.158425e-02
## purposecredit_card                 -6.107965e-02
## purposedebt_consolidation          -1.529318e-01
## purposeeducational                  1.332428e-01
## purposehome_improvement            -1.524668e-01
## purposehouse                       -9.399033e-02
## purposemajor_purchase              -8.485275e-02
## purposemedical                     -1.956414e-01
## purposemoving                      -1.636192e-01
## purposeother                       -8.853654e-02
## purposerenewable_energy            -2.883932e-01
## purposesmall_business              -5.102216e-01
## purposevacation                    -9.262761e-02
## purposewedding                      2.826418e-01
## addr_stateAL                       -1.973950e-01
## addr_stateAR                       -5.919637e-02
## addr_stateAZ                       -1.210503e-01
## addr_stateCA                       -1.177515e-01
## addr_stateCO                        1.088124e-01
## addr_stateCT                        2.500888e-02
## addr_stateDC                        4.654147e-01
## addr_stateDE                       -4.662514e-02
## addr_stateFL                       -1.833211e-01
## addr_stateGA                       -1.614537e-02
## addr_stateHI                       -1.610241e-01
## addr_stateIA                       -1.538244e-01
## addr_stateID                        1.155041e+00
## addr_stateIL                        1.102539e-01
## addr_stateIN                       -1.250537e-01
## addr_stateKS                        1.497544e-01
## addr_stateKY                       -1.219469e-03
## addr_stateLA                       -1.542646e-01
## addr_stateMA                       -9.054951e-02
## addr_stateMD                       -1.479567e-01
## addr_stateME                        2.801215e+00
## addr_stateMI                       -9.021531e-02
## addr_stateMN                       -1.043299e-01
## addr_stateMO                       -6.335870e-02
## addr_stateMS                        6.428139e-05
## addr_stateMT                        8.925641e-02
## addr_stateNC                       -1.578348e-01
## addr_stateND                        1.508759e+00
## addr_stateNE                        7.619997e-01
## addr_stateNH                        1.847850e-01
## addr_stateNJ                       -1.405565e-01
## addr_stateNM                       -1.532665e-01
## addr_stateNV                       -2.880094e-01
## addr_stateNY                       -1.690205e-01
## addr_stateOH                       -1.725368e-02
## addr_stateOK                       -1.637056e-01
## addr_stateOR                        6.320724e-04
## addr_statePA                       -6.956881e-02
## addr_stateRI                       -8.385937e-02
## addr_stateSC                        1.253668e-01
## addr_stateSD                       -6.817322e-02
## addr_stateTN                       -1.476443e-01
## addr_stateTX                        3.390241e-02
## addr_stateUT                       -1.342979e-01
## addr_stateVA                       -1.562151e-01
## addr_stateVT                        2.305194e-01
## addr_stateWA                        1.053958e-02
## addr_stateWI                        3.220255e-02
## addr_stateWV                        1.808913e-01
## addr_stateWY                        2.795099e-01
## dti                                -1.011735e-02
## delinq_2yrs                        -3.198929e-02
## inq_last_6mths                     -8.881619e-02
## open_acc                           -3.714554e-03
## pub_rec                             4.346689e-02
## revol_bal                           2.975497e-07
## revol_util                         -1.001653e-03
## total_acc                          -1.530366e-04
## initial_list_statusw                1.095776e-01
## application_typeJOINT               2.266379e+00
## acc_now_delinq                     -1.869035e-02
## tot_coll_amt_gt0TRUE               -5.309220e-03
## tot_cur_bal                         2.643953e-07
## total_rev_hi_lim                    2.053354e-06
## perc_funded_amnt_inv               -5.164081e-01
## termShort                          -1.204291e-01
## year                                3.882085e-01